COMPACTNESS (average perim)**2/area
CIRCULARITY (average radius)**2/area
DISTANCE CIRCULARITY area/(av.distance from border)**2
RADIUS RATIO (max.rad-min.rad)/av.radius
PR.AXIS ASPECT RATIO (minor axis)/(major axis)
MAX.LENGTH ASPECT RATIO (length perp. max length)/(max length)
SCATTER RATIO (inertia about minor axis)/(inertia about major axis)
ELONGATEDNESS area/(shrink width)**2
PR.AXIS RECTANGULARITY area/(pr.axis length*pr.axis width)
MAX.LENGTH RECTANGULARITY area/(max.length*length perp. to this)
SCALED VARIANCE (2nd order moment about minor axis)/area ALONG MAJOR AXIS
SCALED VARIANCE.1 (2nd order moment about major axis)/area ALONG MINOR AXIS
SCALED RADIUS OF GYRATION (mavar+mivar)/area
SCALED RADIUS OF GYRATION.1
SKEWNESS ABOUT
SKEWNESS ABOUT1 (3rd order moment about major axis)/sigma_min**3 MAJOR AXIS
SKEWNESS ABOUT2 (3rd order moment about minor axis)/sigma_maj**3 MINOR AXIS
HOLLOWS RATIO (area of hollows)/(area of bounding polygon)
CLASS (CLASS OF VEHICLE: CAR, VAN, BUS)
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
import sys
import warnings
# Silence all warnings unless the user explicitly asked for them
# (e.g. via the -W command-line flag or PYTHONWARNINGS).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
#Load the dataset (assumes vehicle-2.csv is in the working directory)
vehicle_df = pd.read_csv('vehicle-2.csv')
#Preview the first five rows
vehicle_df.head()
#Check the number of rows and columns
vehicle_df.shape
#Check the datatype of each attribute
vehicle_df.dtypes
#Class counts for the target variable 'class'
vehicle_df['class'].value_counts()
#Summary statistics for the numeric columns
vehicle_df.describe().transpose()
#Check, column by column, whether any null values are present
vehicle_df.isnull().any()
The following attributes have null values in the dataframe.
#Pie chart and count plot of the vehicle class distribution
#NOTE(review): the pie title says "Deposit Distribution Pie" and the earlier
#comment mentioned "Term Deposit" -- likely copy-paste leftovers from another
#project. Also the pie labels and the countplot tick labels are hard-coded in
#different orders; verify both against value_counts() ordering.
figure,ax = plt.subplots(1,2,figsize=(15,7))
labels =['Car', 'Bus', 'Van']
ax[0].set_title("Deposit Distribution Pie")
vehicle_df['class'].value_counts().plot.pie(ax = ax[0], labels =['Car', 'Bus', 'Van'], autopct='%1.1f%%', colors = ['#176BB8', '#F6E708', '#ED1515'])
plt.title('Vehicle Class Index')
plt.legend(labels,loc=3)
sns.countplot(vehicle_df['class'], ax = ax[1], palette = ['#ED1515', '#176BB8', '#F6E708'])
ax[1].set_xticklabels(['Van', 'Car', 'Bus'])
vehicle_df['class'].value_counts()
#The target is object-typed and algorithms cannot consume string labels.
#Convert it to a categorical, then encode car/bus/van as 1/2/3 with the
#same sequential replace calls as before, just driven by a loop.
vehicle_df['class'] = vehicle_df['class'].astype('category')
for label, code in (('car', 1), ('bus', 2), ('van', 3)):
    vehicle_df['class'] = vehicle_df['class'].replace([label], code)
vehicle_df['class'].value_counts()
# Split into the feature frame and the target series:
vehiclef_df = vehicle_df.drop(["class"], axis=1)
class_df = vehicle_df["class"]
#Count nulls per feature column
vehiclef_df.isnull().sum()
#missingno visualizes missing data; install it if not already present
!pip install missingno
import missingno as msno
#Bar chart: the number at the top of each bar is the count of non-null values
msno.bar(vehiclef_df)
14 columns out of 18 features have missing values. Total no. of value each column has is indicated at the top of each bar. We need to address these missing values
# Handling Nulls: Replacing Nulls by mean value
def remove_nulls(df_in, col_name):
    """Replace nulls in ``df_in[col_name]`` with the column mean, in place.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to clean; it is modified in place.
    col_name : str
        Name of the numeric column to impute.

    Returns
    -------
    pandas.DataFrame
        The same frame, returned for chaining.
    """
    # Assign the filled Series back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form fills a potential
    # intermediate copy and is deprecated under pandas copy-on-write.
    df_in[col_name] = df_in[col_name].fillna(df_in[col_name].mean())
    return df_in
# Impute the column mean into every feature column, one column at a time.
# NOTE(review): 'max.length_aspect_ratio' is not in this list -- presumably
# it had no nulls in the source data; verify against the msno.bar output.
columns_with_nulls = [
    'compactness', 'circularity', 'distance_circularity', 'radius_ratio',
    'pr.axis_aspect_ratio', 'scatter_ratio', 'elongatedness',
    'pr.axis_rectangularity', 'max.length_rectangularity', 'scaled_variance',
    'scaled_variance.1', 'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
    'skewness_about.2', 'hollows_ratio',
]
for column in columns_with_nulls:
    clean_vehicle = remove_nulls(vehiclef_df, column)
# Grand total of nulls left in the frame -- should be 0
clean_vehicle.isnull().sum().sum()
Null Values treated
# Boxplots of the columns that visually show outliers
plt.figure(figsize = (15, 15))
boxplot = vehiclef_df.boxplot(column=['radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration.1', 'skewness_about'], grid = False)
# Handling Outliers using IQR method
def handle_outlier(df_in, col_name):
    """Replace IQR outliers in ``df_in[col_name]`` with the column median, in place.

    A value is an outlier when it lies below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR. The median is computed once, before any replacement,
    so both fences use the same reference value.

    Returns the same frame for chaining.
    """
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1  # Interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    print('IQR lower bound and upper bound of', col_name, 'is', fence_low, 'and', fence_high, 'respectively')
    # Bug fix: the original replaced only values above fence_high; fence_low
    # was computed and printed but never applied, so low-side outliers
    # survived. Replace both tails with the pre-replacement median.
    median = df_in[col_name].median()
    outliers = (df_in[col_name] < fence_low) | (df_in[col_name] > fence_high)
    df_in.loc[outliers, col_name] = median
    return df_in
# Treat outliers feature by feature using the IQR rule defined above,
# preserving the original column order.
outlier_columns = [
    'compactness', 'circularity', 'distance_circularity', 'radius_ratio',
    'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
    'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
    'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
    'skewness_about.2', 'hollows_ratio',
]
for column in outlier_columns:
    clean_vehicle2 = handle_outlier(clean_vehicle, column)
print('outliers handled successfully')
clean_vehicle2.head()
# Defining Features and Target:
feature = clean_vehicle2
target = vehicle_df["class"]
#Feature-feature correlation heatmap; the upper triangle is masked
#because it mirrors the lower triangle
corr = feature.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize = (12, 12))
sns.heatmap(corr, mask = mask,annot = True, square = True)
#Feature-target correlation: mask the first 18 rows so only the
#class-vs-feature row remains visible (19 = 18 features + target)
feature_target_corr = feature.join(target).corr()
mask = np.zeros((19, 19))
mask[:18, :] = 1
plt.figure(figsize=(15, 8))
with sns.axes_style("white"):
sns.heatmap(feature_target_corr, annot=True, mask = mask)
Only elongatedness is significantly correlated to 'class' of vehicle, i.e. our target variable.
# concatenating features from cleaned dataframe and target column i.e. class from original dataframe
# Bug fix: the original wrote `complete_data = clean_vehicle2`, which is an
# alias, not a copy -- attaching 'class' below therefore also injected the
# target into clean_vehicle2, the frame that is later z-scored and fed to the
# models (target leakage). Copy first so the feature frame stays features-only.
complete_data = clean_vehicle2.copy()
complete_data['class'] = vehicle_df['class']
complete_data.shape
#All 18 feature names, used to drive one frequency-distribution subplot each
col_names = ['compactness', 'circularity', 'distance_circularity', 'radius_ratio', 'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio', 'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity', 'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1', 'skewness_about.2', 'hollows_ratio']
fig, ax = plt.subplots(len(col_names), figsize=(15, 20))
for i, col_val in enumerate(col_names):
#One histogram+KDE per feature, stacked vertically
sns.distplot(clean_vehicle2[col_val], hist=True, ax=ax[i])
ax[i].set_title('Freq dist '+col_val, fontsize=10)
ax[i].set_xlabel(col_val, fontsize=8)
ax[i].set_ylabel('Count', fontsize=8)
plt.show()
#Summary statistics for the 'circularity' attribute
clean_vehicle2['circularity'].describe().transpose()
#Count of observations at each circularity value
fig, ax = plt.subplots()
fig.set_size_inches(25, 5)
sns.countplot(x = 'circularity', palette="rocket", data = complete_data)
ax.set_xlabel('circualrity', fontsize=25)  # NOTE(review): label string has a typo ('circualrity')
ax.set_ylabel('Count', fontsize=25)
ax.set_title('circularity', fontsize=25)
sns.despine()
#Histogram + KDE of circularity
fig,ax = plt.subplots()
fig.set_size_inches(30, 12)
sns.distplot(clean_vehicle2['circularity'], hist = True, kde = True,
bins=int(180/5), color = 'blue',
hist_kws={'edgecolor':'black'})
ax.set_xlabel('circularity', fontsize=15)
#Per-class circularity distributions
sns.FacetGrid(complete_data, hue = 'class', size = 10).map(sns.distplot, 'circularity').add_legend()
#Joint distribution of circularity vs max.length_rectangularity
sns.jointplot(x = "circularity", y = "max.length_rectangularity", data = clean_vehicle2);
f, ax = plt.subplots(figsize = (10, 10))
cmap = sns.cubehelix_palette(as_cmap = True, dark = 0, light = 1, reverse = True)
#2-D KDE of the same feature pair
sns.kdeplot(clean_vehicle2['circularity'], clean_vehicle2['max.length_rectangularity'], cmap = cmap, n_levels = 60, shade=True)
#Scatter of circularity vs elongatedness
sns.relplot( x = "circularity", y = "elongatedness", data = clean_vehicle2);
#Summary statistics for the 'distance_circularity' attribute
clean_vehicle2['distance_circularity'].describe()
fig, ax = plt.subplots()
fig.set_size_inches(25, 5)
#Count of observations at each distance_circularity value
sns.countplot(x = 'distance_circularity', palette="rocket", data = complete_data)
ax.set_xlabel('distance_circualrity', fontsize=25)  # NOTE(review): label string has a typo
ax.set_ylabel('Count', fontsize=25)
ax.set_title('distance_circularity', fontsize=25)
sns.despine()
#Histogram + KDE of distance_circularity (original comment said 'Age Attribute' -- copy-paste leftover)
fig,ax = plt.subplots()
fig.set_size_inches(30, 12)
sns.distplot(clean_vehicle2['distance_circularity'], hist = True, kde = True,
bins=int(180/5), color = 'blue',
hist_kws={'edgecolor':'black'})
ax.set_xlabel('distance_circularity', fontsize=15)
#Per-class distance_circularity distributions
sns.FacetGrid(complete_data, hue = 'class', size = 10).map(sns.distplot, 'distance_circularity').add_legend()
#2x2 scatter grid: distance_circularity against four related features
f = plt.figure()
f, axes = plt.subplots(nrows = 2, ncols = 2, sharex=True, sharey = True)
f.set_size_inches(15, 15)
axes[0][0].scatter(x = "distance_circularity", y = "scaled_variance", data = clean_vehicle2)
axes[0][0].set_xlabel('distance_circularity', labelpad = 5)
axes[0][0].set_ylabel('scaled_variance', labelpad = 5)
axes[0][1].scatter(x = "distance_circularity", y = "scaled_variance.1", data = clean_vehicle2)
axes[0][1].set_xlabel('distance_circularity', labelpad = 5)
axes[0][1].set_ylabel('scaled_variance.1', labelpad = 5)
axes[1][0].scatter(x = "distance_circularity", y = "pr.axis_rectangularity", data = clean_vehicle2)
axes[1][0].set_xlabel('distance_circularity')
axes[1][0].set_ylabel('pr.axis_rectangularity', labelpad = 5)
axes[1][1].scatter(x = "distance_circularity", y = "elongatedness", data = clean_vehicle2)
axes[1][1].set_xlabel('distance_circularity')
axes[1][1].set_ylabel('elongatedness', labelpad = 5)
#Summary statistics for the 'scatter_ratio' attribute
clean_vehicle2['scatter_ratio'].describe()
#Histogram + KDE of scatter_ratio (original comment said 'Age Attribute' -- copy-paste leftover)
fig,ax = plt.subplots()
fig.set_size_inches(30, 12)
sns.distplot(clean_vehicle2['scatter_ratio'], hist = True, kde = True,
bins=int(180/5), color = 'blue',
hist_kws={'edgecolor':'black'})
ax.set_xlabel('scatter_ratio', fontsize=15)
#Scatter plots of scatter_ratio against related shape features
sns.relplot( x = "scatter_ratio", y = "elongatedness", data = clean_vehicle2);
sns.relplot( x = "scatter_ratio", y = "scaled_variance", data = clean_vehicle2);
sns.relplot( x = "scatter_ratio", y = "pr.axis_rectangularity", data = clean_vehicle2);
#Summary statistics for the 'elongatedness' attribute
clean_vehicle2['elongatedness'].describe()
fig, ax = plt.subplots()
fig.set_size_inches(25, 5)
#Count of observations at each elongatedness value
sns.countplot(x = 'elongatedness', palette="rocket", data = complete_data)
ax.set_xlabel('elongatedness', fontsize=25)
ax.set_ylabel('Count', fontsize=25)
ax.set_title('elongatedness', fontsize=25)
sns.despine()
#Histogram + KDE of elongatedness (original comment said 'Age Attribute' -- copy-paste leftover)
fig,ax = plt.subplots()
fig.set_size_inches(30, 12)
sns.distplot(clean_vehicle2['elongatedness'], hist = True, kde = True,
bins=int(180/5), color = 'blue',
hist_kws={'edgecolor':'black'})
ax.set_xlabel('elongatedness', fontsize=15)
#Per-class elongatedness distributions
sns.FacetGrid(complete_data, hue = 'class', size = 10).map(sns.distplot, 'elongatedness').add_legend()
#Scatter plots against variance / rectangularity features
sns.relplot( x = "elongatedness", y = "scaled_variance.1", data = clean_vehicle2);
sns.relplot( x = "elongatedness", y = "scaled_variance", data = clean_vehicle2);
sns.relplot( x = "elongatedness", y = "pr.axis_rectangularity", data = clean_vehicle2);
#Summary statistics for the 'pr.axis_rectangularity' attribute
clean_vehicle2['pr.axis_rectangularity'].describe()
fig, ax = plt.subplots()
fig.set_size_inches(25, 5)
#Count of observations at each pr.axis_rectangularity value
sns.countplot(x = 'pr.axis_rectangularity', palette="rocket", data = complete_data)
ax.set_xlabel('pr.axis_rectangularity', fontsize=25)
ax.set_ylabel('Count', fontsize=25)
ax.set_title('pr.axis_rectangularity', fontsize=25)
sns.despine()
#Histogram + KDE of pr.axis_rectangularity (original comment said 'Age Attribute' -- copy-paste leftover)
fig,ax = plt.subplots()
fig.set_size_inches(30, 12)
sns.distplot(clean_vehicle2['pr.axis_rectangularity'], hist = True, kde = True,
bins=int(180/5), color = 'blue',
hist_kws={'edgecolor':'black'})
ax.set_xlabel('pr.axis_rectangularity', fontsize=15)
#Per-class pr.axis_rectangularity distributions
sns.FacetGrid(complete_data, hue = 'class', size = 10).map(sns.distplot, 'pr.axis_rectangularity').add_legend()
#Scatter plots against the two scaled-variance features
sns.relplot( x = "pr.axis_rectangularity", y = "scaled_variance", data = clean_vehicle2);
sns.relplot( x = "pr.axis_rectangularity", y = "scaled_variance.1", data = clean_vehicle2);
from scipy.stats import zscore
#Standardize every column to zero mean / unit variance
#NOTE(review): clean_vehicle2 was earlier aliased as complete_data, which had
#the 'class' column appended -- so the target may be getting scaled in here
#alongside the features; confirm the column set before modeling.
XScaled = clean_vehicle2.apply(zscore)
XScaled.head()
Here we are having 18 features and 846 records. We are likely to have the Curse of Dimensionality.
Statistical tests can be used to select those features that have the strongest relationship with the output variable.
The scikit-learn library provides the "SelectKBest class" that can be used with a suite of different statistical tests to select a specific number of features.
So, I am using the chi-squared (chi²) statistical test to select 12 of the best features from the dataset.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X = clean_vehicle2 #independent columns
y = vehicle_df["class"] #target column
#apply SelectKBest class to extract top 12 best features (chi-squared scoring)
bestfeatures = SelectKBest(score_func = chi2, k = 12)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score'] #naming the dataframe columns
print(featureScores.nlargest(12,'Score')) #print 12 best features
#Drop the six lowest-scoring features, keeping the 12 selected ones, then re-scale
new_vehicle = clean_vehicle2.drop(columns =['circularity', 'pr.axis_aspect_ratio', 'pr.axis_rectangularity', 'max.length_rectangularity', 'skewness_about.2', 'hollows_ratio'])
XScaled_new = new_vehicle.apply(zscore)
XScaled_new.head()
#Visually inspect the covariance between independent dimensions
sns.pairplot(XScaled, diag_kind='kde')
from sklearn.decomposition import PCA
#Fit a full PCA to see how much variance each component explains
pca = PCA().fit(XScaled)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure(figsize=(10, 10))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Vehicle Dataset Explained Variance')  # bug fix: title said 'Pulsar Dataset' (copy-paste from another project)
plt.show()
#Re-fit with the chosen number of components and project the data
pca = PCA(n_components = 7, random_state = 1)
dataset = pca.fit_transform(XScaled)
print(pca.explained_variance_)
print(pca.components_)
print('-----------------------------------------------------------------')
print(pca.explained_variance_ratio_)
Xpca7 = pca.transform(XScaled)
print(Xpca7)
#Bug fix: the original referenced undefined 'Xpca12' here (NameError)
sns.pairplot(pd.DataFrame(Xpca7))
#SVM trained on the 12 chi2-selected (scaled) features
X = XScaled_new
y = vehicle_df["class"]
svc_clf = SVC()
svc_clf.fit(X, y)
#NOTE(review): score() here is computed on the training data itself, so this
#is training accuracy, not an estimate of generalization performance
svc_score = svc_clf.score(X, y)
print('Overall Acuracy: ', svc_score)  # NOTE(review): 'Acuracy' typo in output string
#SVM trained on the 7 PCA components, same caveat about training accuracy
X1 = Xpca7
y = vehicle_df["class"]
svc_clf = SVC()
svc_clf.fit(X1, y)
svc_score = svc_clf.score(X1, y)
print('Overall Acuracy: ', svc_score)
from sklearn.model_selection import GridSearchCV
#Hyperparameter grid: regularization strength C and kernel type
param_grid = {'C':[0.01, 0.05, 0.5, 1], 'kernel':['linear','rbf']}
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 2)
# Fitting the Tuned Model
grid.fit(X1, y)
#Training-set accuracy of the best estimator found by the grid search
grid_score = grid.score(X1, y)
print('Overall Acuracy: ', grid_score)
#3-fold cross-validation gives a fairer estimate of generalization
grid_scores = cross_val_score(grid, X1, y, cv = 3)
print('Cross Val Score: ', grid_scores.mean())
After applying PCA to the dataset, the resulting principal components are independent of one another: there is no correlation among them.
So, if the input dimensions are too high, use of PCA speeds up the algorithm.
PCA Reduces Overfitting: Overfitting mainly occurs when there are too many variables in the dataset. So, PCA helps in overcoming the overfitting issue by reducing the number of features.
PCA Improves Visualization: It is very hard to visualize and understand the data in high dimensions. PCA transforms a high dimensional data to low dimensional data so that it can be visualized easily.